pacman::p_load(dplyr, ggplot2, readr, haven, broom, purrr, tidyr, magrittr, labelled, sjPlot, viridis, forcats, ggthemes, cluster, factoextra, fpc)
ggplot2::theme_set(ggthemes::theme_few())
ess <- get(load("data/Rdata/ess_final.Rdata")) %>%
group_by(country) %>%
mutate(id = paste0(iso2, 1:n())) %>%
ungroup()
First we explore the diemnionality of the given trust items by applying standard pairwise scatterplots for euch variable combination and principal component analysis (efa) gain information on the distribution of variance in the data. After that conformatory factor analysis scores are computed for one trust dimesnion and merged for later analysis.
trust_data_id <- ess %>%
select(id, contains("trust")) %>%
na.omit()
trust_data <- trust_data_id %>% select(-id)
ggsave(trust_cor, filename = "images/trust_cor.png", height = 5, width = 7)
plot: [1,1] [=-----------------------------------------] 3% est: 0s
plot: [1,2] [==----------------------------------------] 6% est: 6s
plot: [1,3] [====--------------------------------------] 8% est: 6s
plot: [1,4] [=====-------------------------------------] 11% est: 6s
plot: [1,5] [======------------------------------------] 14% est: 6s
plot: [1,6] [=======-----------------------------------] 17% est: 6s
plot: [2,1] [========----------------------------------] 19% est: 6s
plot: [2,2] [=========---------------------------------] 22% est: 8s
plot: [2,3] [==========--------------------------------] 25% est: 9s
plot: [2,4] [============------------------------------] 28% est: 8s
plot: [2,5] [=============-----------------------------] 31% est: 8s
plot: [2,6] [==============----------------------------] 33% est: 7s
plot: [3,1] [===============---------------------------] 36% est: 6s
plot: [3,2] [================--------------------------] 39% est: 7s
plot: [3,3] [==================------------------------] 42% est: 8s
plot: [3,4] [===================-----------------------] 44% est: 7s
plot: [3,5] [====================----------------------] 47% est: 7s
plot: [3,6] [=====================---------------------] 50% est: 6s
plot: [4,1] [======================--------------------] 53% est: 6s
plot: [4,2] [=======================-------------------] 56% est: 6s
plot: [4,3] [========================------------------] 58% est: 5s
plot: [4,4] [==========================----------------] 61% est: 5s
plot: [4,5] [===========================---------------] 64% est: 5s
plot: [4,6] [============================--------------] 67% est: 4s
plot: [5,1] [=============================-------------] 69% est: 4s
plot: [5,2] [==============================------------] 72% est: 4s
plot: [5,3] [================================----------] 75% est: 3s
plot: [5,4] [=================================---------] 78% est: 3s
plot: [5,5] [==================================--------] 81% est: 3s
plot: [5,6] [===================================-------] 83% est: 2s
plot: [6,1] [====================================------] 86% est: 2s
plot: [6,2] [=====================================-----] 89% est: 2s
plot: [6,3] [======================================----] 92% est: 1s
plot: [6,4] [========================================--] 94% est: 1s
plot: [6,5] [=========================================-] 97% est: 0s
plot: [6,6] [==========================================]100% est: 0s
fit_pca1 <- trust_data %>%
scale() %>%
prcomp()
ggsave(pca1_vis, filename = "images/pca1_vis.pdf")
Saving 7 x 7 in image
# Contributions of variables to PC1
trust_contr <- fviz_contrib(fit_pca1, choice = "var", axes = 1, top = 10, fill = "grey", color = NA)
### Screeplot: Eigenvalues
trust_scree <- fviz_eig(fit_pca1, addlabels = T,barfill = "grey", barcolor = NA)
library(gridExtra)
Attache Paket: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
trust_eval <- grid.arrange(trust_scree, trust_contr, ncol = 2)
#ggsave(trust_eval, filename = "images/trust_eval.pdf", width = 10, height = 5)
And again we begin by exploring the diemnionality of immigration items by applying standard pairwise scatterplots for euch variable combination and principal component analysis to gain information on the distribution of variance in the data. After that the PC scores are extracted for the first immigration dimension and merged for later analysis.
imm_data_id <- ess %>%
select(id, contains("imm")) %>%
mutate(imm_econ = (imm_econ-10)*-1) %>%
na.omit()
# summary(imm_data_id$imm_econ)
#(imm_data_id$imm_econ - 10)*-1
imm_data <- imm_data_id %>% select(-id)
ggsave(imm_cor, filename = "images/imm_cor.png", height = 4, width = 6)
plot: [1,1] [===---------------------------------------] 6% est: 0s
plot: [1,2] [=====-------------------------------------] 12% est: 3s
plot: [1,3] [========----------------------------------] 19% est: 3s
plot: [1,4] [==========--------------------------------] 25% est: 3s
plot: [2,1] [=============-----------------------------] 31% est: 2s
plot: [2,2] [================--------------------------] 38% est: 3s
plot: [2,3] [==================------------------------] 44% est: 3s
plot: [2,4] [=====================---------------------] 50% est: 2s
plot: [3,1] [========================------------------] 56% est: 2s
plot: [3,2] [==========================----------------] 62% est: 2s
plot: [3,3] [=============================-------------] 69% est: 2s
plot: [3,4] [================================----------] 75% est: 2s
plot: [4,1] [==================================--------] 81% est: 1s
plot: [4,2] [=====================================-----] 88% est: 1s
plot: [4,3] [=======================================---] 94% est: 0s
plot: [4,4] [==========================================]100% est: 0s
fit_pca2 <- imm_data %>%
scale() %>%
prcomp()
ggsave(pca2_vis, filename = "images/pca2_vis.pdf")
Saving 7 x 7 in image
# Contributions of variables to PC1
imm_contr <- fviz_contrib(fit_pca2, choice = "var", axes = 1, top = 10, fill = "grey", color = NA)
### Screeplot: Eigenvalues
imm_scree <- fviz_eig(fit_pca2, addlabels = T,barfill = "grey", barcolor = NA)
library(gridExtra)
imm_eval <- grid.arrange(imm_scree, imm_contr, ncol = 2)
#ggsave(imm_eval, filename = "images/imm_eval.pdf", width = 10, height = 5)
glimpse(ess)
Observations: 57,937
Variables: 38
$ id <chr> "AT1", "AT2", "AT3", "AT4", "AT5", "AT6", "AT7...
$ round <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
$ region <chr> "AT13", "AT13", "AT31", "AT13", "AT22", "AT21"...
$ gndr <dbl> 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1...
$ age <dbl> 1982, 1964, 1948, 1962, 1996, 1951, 1964, 1972...
$ income <dbl> NA, 5, 2, 4, 2, 10, 2, 8, 3, 4, 1, 3, 5, NA, 3...
$ pol_inter <dbl> 1, 1, 3, 2, 3, 2, 3, 3, 4, 2, 3, 1, 2, 3, 3, 3...
$ lrscale <dbl> 0, 1, 5, 0, 5, 5, 4, 5, 5, 5, 5, 8, 8, 5, NA, ...
$ edu_year <dbl> 21, 16, 13, 12, 13, 13, 8, 17, 14, 16, 9, 11, ...
$ edu <dbl> 720, 313, 322, 322, 322, 313, 212, 423, 322, 3...
$ rel <dbl> 7, 7, 6, 6, 5, 5, 5, 5, 7, 5, 3, 7, 4, 5, 5, 5...
$ have_say <dbl> 2, 1, 2, 3, 3, 2, 2, 2, 2, 3, 1, 2, 2, 2, 1, 4...
$ have_infl <dbl> 2, 2, 2, 3, 4, 2, 2, 2, 2, 3, 1, 2, 3, 3, 1, 2...
$ pol_grp <dbl> 2, 3, 2, 2, 3, 2, 1, 2, 2, 2, 1, 3, 4, 3, 1, 3...
$ pol_conf <dbl> 3, 3, 2, 4, 1, 2, 1, 2, 2, 2, 1, 3, 1, 2, 1, 2...
$ trust_parl <dbl> 6, 5, 3, 1, 7, 4, 5, 6, 10, 7, 6, 0, 2, 6, 9, ...
$ trust_leg <dbl> 7, 4, 5, 1, 7, 6, 6, 7, 10, 8, 5, 5, 8, 7, 9, ...
$ trust_police <dbl> 0, 3, 9, 2, 10, 7, 7, 9, 10, 7, 5, 7, 7, 7, 9,...
$ trust_pol <dbl> 2, 3, 3, 1, 6, 3, 2, 5, 9, 4, 3, 0, 6, 5, 7, 8...
$ trust_party <dbl> 8, 4, 3, 1, 7, 3, 2, 3, 8, 4, 3, 0, 6, 5, 6, 8...
$ trust_eu <dbl> 9, 6, 4, 1, 9, 2, NA, 4, 8, 1, 3, 0, 5, 7, 6, ...
$ trstun <dbl> 10, 7, 5, 1, 10, 4, NA, 6, 10, 0, NA, 0, 4, 6,...
$ s_life <dbl> 5, 5, 9, 7, 10, 7, 5, 9, 10, 7, 8, 7, 6, 8, 3,...
$ s_econ <dbl> 4, 1, 6, 5, 7, 3, 5, 6, 10, 5, 0, 5, 6, 5, 5, ...
$ s_gov <dbl> 4, 3, 3, 6, 5, 5, 6, 5, 10, 3, 4, 0, 2, 4, 6, ...
$ s_dem <dbl> 6, 3, 6, 6, 9, 5, 7, 8, 10, 7, 5, 3, 8, 5, 8, ...
$ state_edu <dbl> 3, 0, 8, 7, 10, 4, 4, 7, 9, 8, 10, 4, 5, 6, 3,...
$ sate_health <dbl> 5, 3, 6, 6, 10, 5, 9, 8, 10, 8, 10, 5, 7, 7, 9...
$ imm_econ <dbl> 10, 8, 5, 7, 2, 8, 6, 5, 9, 7, 5, 1, 2, 5, 0, ...
$ imm_same <dbl> 1, 1, 2, NA, 2, 1, 1, 3, 1, 2, 3, 4, 2, 2, 2, ...
$ imm_diff <dbl> 1, 1, 3, NA, 2, 1, 3, 3, 1, 2, 3, 4, 4, 2, 4, ...
$ imm_poor <dbl> 1, 1, 3, NA, 1, 1, 3, 3, 2, 2, 3, 4, 4, 3, 3, ...
$ vote <chr> "", "", "SPÖ", "", "", "ÖVP", "", "ÖVP", "", "...
$ vote_id <chr> "", "", "AT_1", "", "", "AT_2", "", "AT_2", ""...
$ country <chr> "Austria", "Austria", "Austria", "Austria", "A...
$ iso2 <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT"...
$ iso3 <chr> "AUT", "AUT", "AUT", "AUT", "AUT", "AUT", "AUT...
$ party_id <dbl> NA, NA, 1301, NA, NA, 1302, NA, 1302, NA, 1302...
sat_data_id <- ess %>%
rename(state_health = sate_health) %>%
select(id, starts_with("s_"), starts_with("state_")) %>%
na.omit()
sat_data_with <- sat_data_id %>% select(-id)
sat_data_without <- sat_data_id %>% select(-id, -s_life)
library(GGally)
element_scatter <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
geom_jitter(alpha = .01, color = "#3D3D3D", size = .5) +
geom_smooth(mapping = mapping, method = "lm")
}
element_hist <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
geom_bar(fill = "#3D3D3D", width = 1)
}
pm3 <- ggpairs(
sat_data_with,
#columns = c("total_bill", "time", "tip"),
diag = list(continuous = wrap(element_hist)),
lower = list(
#combo = wrap("facethist", binwidth = 1),
continuous = wrap(element_scatter)
)
) + theme_few()
ggsave(pm3, filename = "images/sat_cor.png", height = 5, width = 7)
plot: [1,1] [=-----------------------------------------] 3% est: 0s
plot: [1,2] [==----------------------------------------] 6% est: 6s
plot: [1,3] [====--------------------------------------] 8% est: 6s
plot: [1,4] [=====-------------------------------------] 11% est: 5s
plot: [1,5] [======------------------------------------] 14% est: 6s
plot: [1,6] [=======-----------------------------------] 17% est: 5s
plot: [2,1] [========----------------------------------] 19% est: 5s
plot: [2,2] [=========---------------------------------] 22% est: 7s
plot: [2,3] [==========--------------------------------] 25% est: 6s
plot: [2,4] [============------------------------------] 28% est: 6s
plot: [2,5] [=============-----------------------------] 31% est: 6s
plot: [2,6] [==============----------------------------] 33% est: 5s
plot: [3,1] [===============---------------------------] 36% est: 5s
plot: [3,2] [================--------------------------] 39% est: 7s
plot: [3,3] [==================------------------------] 42% est: 7s
plot: [3,4] [===================-----------------------] 44% est: 7s
plot: [3,5] [====================----------------------] 47% est: 6s
plot: [3,6] [=====================---------------------] 50% est: 6s
plot: [4,1] [======================--------------------] 53% est: 5s
plot: [4,2] [=======================-------------------] 56% est: 5s
plot: [4,3] [========================------------------] 58% est: 6s
plot: [4,4] [==========================----------------] 61% est: 6s
plot: [4,5] [===========================---------------] 64% est: 5s
plot: [4,6] [============================--------------] 67% est: 5s
plot: [5,1] [=============================-------------] 69% est: 4s
plot: [5,2] [==============================------------] 72% est: 4s
plot: [5,3] [================================----------] 75% est: 4s
plot: [5,4] [=================================---------] 78% est: 4s
plot: [5,5] [==================================--------] 81% est: 3s
plot: [5,6] [===================================-------] 83% est: 3s
plot: [6,1] [====================================------] 86% est: 2s
plot: [6,2] [=====================================-----] 89% est: 2s
plot: [6,3] [======================================----] 92% est: 1s
plot: [6,4] [========================================--] 94% est: 1s
plot: [6,5] [=========================================-] 97% est: 0s
plot: [6,6] [==========================================]100% est: 0s
fit_pca3 <- sat_data_without %>%
scale() %>%
prcomp()
q1 <- sat_data_with %>%
scale() %>%
prcomp() %>%
fviz_pca_var(., col.var = "black")
#sjt.pca(fit_pca2, nmbr.fctr = 1)
q2 <- fviz_pca_var(fit_pca3, col.var = "black")
library(gridExtra)
pca_sat_refit <- grid.arrange(q1, q2, ncol = 2)
ggsave(pca_sat_refit, filename = "images/pca3_vis.pdf")
Saving 7.29 x 4.51 in image
# Contributions of variables to PC1
sat_contr <- fviz_contrib(fit_pca3, choice = "var", axes = 1, top = 10, fill = "grey", color = NA)
### Screeplot: Eigenvalues
sat_scree <- fviz_eig(fit_pca3, addlabels = T,barfill = "grey", barcolor = NA)
library(gridExtra)
sat_eval <- grid.arrange(sat_scree, sat_contr, ncol = 2)
ggsave(sat_eval, filename = "images/sat_eval.pdf", width = 10, height = 5)
fa_trust <- trust_data %>%
factanal(., factors = 1, scores = 'regression')
trust_data_id$trust_scores <- fa_trust$scores[,1]
trust_data_id$trust_scores %>% head
dd <- trust_data_id %>% select(id, trust_scores)
ess_score <- ess %>%
dplyr::left_join(dd)
#save(ess_score, file = "data/Rdata/ess_score.Rdata")
\[f(x_i) = \frac{1}{2\pi^{p/2}|\Sigma|^{1/2}}e^{-\frac{1}{2}(x_i-\mu)^T\Sigma^{-1} (x_i-\mu)}\]
In addition, the covariance matrix is modelled by a structure of common factors accounting for a joint component of variance and specific residual variances for each variable:
\[\Sigma = \Lambda\Lambda^T + D_\psi\]
Need to first: write out the likelihood, and then maximize it!!! Easier to start with a one-factor solution…
\[\Sigma = \lambda\lambda^T + D_\psi\]
The proportion of the variance in each original variable \(x_d\) accounted for by the first \(PC_1\) given by the sum of the squared factor loadings; that is \(\sum^c_{k=1} f^2_{ik}\). When c=p (all components are retained). \($\sum^c_{k=1} f^2_{ik}=1\) (all variance is explained)
Factor loadings are the correlations between the original variables x and the components PC, denoted as \(F = cor(x, PC) = uD^{1/2}\). Internal Validation measure to see the correlation between predictors and extracted components.
The factor loadings matrix is usually rotated or re-oriented in order to receive uncorrelated factors/ components. The goal is to find clusters of variables that are highly correlated and to large extend define only one factor.
Common factor model - observed variance in each measure is attributable to a relatively small number of common factors and a single speficif factor (uncorrelated to other factors in the model).
my opinion
\[x_{i1} = \lambda_1\xi_{i1} + \lambda_2\xi_{i2} + ... + \delta_i \]
her opinion:
\[x_{i} = \lambda_{i1}\xi_1 + \lambda_{i2}\xi_2 + ... + \delta_i \]
The common analysis is appropriate when there is a latent trait or unobservable characteristics. Used within suervey questions about attitudes. The goal is to identify common factors captering the variance from these questions and which can also be used as factor scores.
\[h_i^2 = \sum_k \lambda^2_{ik} = 1- \theta_{ik}^2\] where \(\theta_{ik}^2 = var(\delta_i)\) is the fcator uniqueness. * The solution to the common factor model is determined by orienting the first factor so that it captures the greatest possible varinace but is uncorrelated with the first factor. * The correlation between X variables and the … factors are called loadings \(\Lambda\). * The factor scores present the psoitions of the observations in the common factor space. The factpr score coefficients are given by
\[B = R^{-1}\Lambda_c\]
where R is the correlation matrix * The factor scores are calculated as:
\[\Xi = X_SB\] These factor scores are included in the data and can be used instead of the original variables.
library(ggplot2)
library(survival)
data(lung, package = "survival")
summary(ess$rel)
ess_surv <- ess
ess_surv$year <- 2017 - ess_surv$year
sf.ess <- survival::survfit(Surv(year, gndr) ~ 1, data = ess_surv)
ggsurv(sf.ess)
# Color by cos2 values: quality on the factor map
fviz_pca_var(
fit_pca1,
col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
# Graph of individuals
# fviz_pca_ind(fit_pca1)
# Total cos2 of variables on Dim.1 and Dim.2
fviz_cos2(fit_pca1, choice = "var", axes = 1:2)
# Contributions of variables to PC1
fviz_contrib(fit_pca1, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
#fviz_contrib(fit_pca1, choice = "var", axes = 2, top = 10)
# The total contribution to PC1 and PC2 is obtained with
#fviz_contrib(fit_pca1, choice = "var", axes = 1:3, top = 10)